////////////////////////////////////////////////////////////////////////
// Immigrant panel
//
// We are using the LISS and immigrant data. The data are publicly available
//
// https://www.lissdata.nl/lissdata/
//
// The categorization of the final comments is in Cat_A (immigrant) 
// and Cat_L (LISS).  This code reproduces the output after merging all 
// relevant variables. The merges involve many surveys and it will be difficult
// to exactly reproduce the input data sets we used (but are not allowed to 
// release publicly). We believe the code is useful nonetheless because it 
// shows detailed analysis decisions.
//
// requires demographic files for 2010/03 through 2013/09 and 
// the corresponding survey data saved as liss_eva_data_merged.dta
//
// Note: the seed was not fixed. Therefore results will be similar,
// but not identical.
////////////////////////////////////////////////////////////////////////

// modify the merged file to get proper id variables etc.

// Run under Stata 13.1 semantics and load the merged evaluation data,
// discarding whatever is currently in memory.
version 13.1
use liss_eva_data_merged.dta, clear

// One respondent id for both panels: take nomem_encr_A where it is present,
// otherwise fall back to nomem_encr_L.
generate nomem_encr = cond(nomem_encr_A==., nomem_encr_L, nomem_encr_A)

// Rows without any respondent number are unusable (77 obs in the original data).
keep if nomem_encr != .

// comment_flag = 1 when the final-comment field is non-empty, 0 otherwise.
generate comment_flag = (eva_comment != "")

// Diagnostic only: how many comments are shorter than 5 characters
// (a few of these are nonsense comments such as "/" or "geen").
count if comment_flag & strlen(eva_comment) < 5

// Strip leading and trailing blanks from the questionnaire identifier so that
// later grouping by questionnaire is not split by stray whitespace.
replace questionnaire = trim(questionnaire)

// remove duplicates: keep exactly one copy of every record that is identical
// on all variables from nomem_encr_A through endtime.
//
// The previous approach (duplicates tag + drop if dups==1) removed BOTH copies
// of every duplicated pair, because the tag variable counts the number of
// *other* identical observations, and it left records occurring three or more
// times (tag >= 2) untouched. duplicates drop keeps the first copy of each
// distinct record and deletes the surplus ones; force is required whenever a
// varlist is given.
duplicates drop nomem_encr_A-endtime, force

////////////////////////////////////////////////////////////////////////
// Reduce dataset by year_month to largest questionnaire per month

// Only rows that carry an immigrant-panel id (nomem_encr_A) are analysed here.
keep if nomem_encr_A != .

// Build a Stata monthly date from year_month (YYYYMM, cf. the "201309"
// literal used further down): string copy -> daily date -> month number.
tostring year_month, generate(year_month_str)
generate date = mofd(date(year_month_str, "YM"))
format date %tm

// drop duplicate responses for each questionnaire, keeping the last response
// mult_resp flags the rows that will be removed.
gen mult_resp = .
// Inside a (respondent, month, questionnaire) group date and questionnaire are
// constant, so the comparison with row _n+1 is true for every row that has a
// successor in the group; the final row of the group compares against an
// out-of-range (missing) value and is left unflagged.
// NOTE(review): no within-group sort key is given, so which row ends up "last"
// is whatever order bysort leaves tied rows in. If a response timestamp is
// available, listing it in parentheses after the by-variables would make
// "keeping the last response" deterministic - confirm.
bysort nomem_encr date questionnaire: ///
replace mult_resp=1 if date[_n+1]==date[_n] & questionnaire[_n+1]==questionnaire[_n]
drop if mult_resp==1

// For every month keep only the questionnaire with the most responses.
// N                 = number of rows for this questionnaire in this month
// max_questionnaire = largest such N among the questionnaires of that month
bysort questionnaire date: generate N = _N
bysort date: egen max_questionnaire = max(N)
keep if N == max_questionnaire

// When several questionnaires tie for the largest N in a month, order them by
// questionnaire within the month and keep only the first max_questionnaire
// rows, i.e. the rows of the questionnaire that sorts first.
bysort date (questionnaire): keep if _n <= max_questionnaire

// Attrition indicator for "did not respond in the following month".
// gap = number of months until the respondent's next observed response; it is
// missing on the respondent's last row, and because missing compares as larger
// than any number, gap > 1 is also true there.
bysort nomem_encr (date): generate gap = date[_n+1] - date[_n]

// attrit = 1 when the next response is more than one month away (or never
// comes), except in 2013m9, where attrit is always 0.
generate attrit = (gap > 1) & (date != mofd(date("201309","YM")))

// Keep only the identifiers, the month, the comment flag and the outcome, and
// write the reduced person-month file.
// replace lets the do-file be re-run when imm_reduced_data.dta already exists
// (the same file is overwritten with replace later in this script); without it
// save stops with an error on every run after the first.
keep year_month nomem_encr comment_flag date attrit questionnaire nomem_encr_A
save imm_reduced_data.dta, replace

////////////////////////////////////////////////////////////////////////
// Merge demographics

//Extract survey respondents
// Builds, from the person-month data in memory, a file with one row per
// respondent and calendar month (201003 through 201309).

// Scratch markers: nom_encr will hold the id on one row per respondent,
// yr_mth the month on one row per month.
gen nom_encr=.
gen yr_mth=.
bysort nomem_encr: replace nom_encr=nomem_encr if _n==1
bysort year_month: replace yr_mth=year_month if _n==1
// Rows that carry both markers are duplicated ...
expand 2 if nom_encr!=. & yr_mth!=.
// ... and listed for inspection.
list nom_encr yr_mth if nom_encr!=. & yr_mth!=.
// nom copies the respondent marker on rows that also carry a month marker.
gen nom=nom_encr if nomem_encr!=. & yr_mth!=.
// Blank the month marker on the second row of every yr_mth group, and the
// respondent marker on the first row of every nom_encr group where it equals nom.
bysort yr_mth: replace yr_mth=. if _n==2
bysort nom_encr: replace nom_encr=. if _n==1 & nom_encr==nom
// Only rows still carrying a respondent marker survive.
// NOTE(review): the net effect of the expand / blank-out steps appears to be
// exactly one row per respondent - confirm.
drop if nom_encr==.
// Rebuild nomem_encr from the marker and discard the scratch variables and the
// original month.
drop nomem_encr
gen nomem_encr=nom_encr
drop nom_encr yr_mth year_month
// 43 rows per remaining row: the month list below has 10 + 12 + 12 + 9 = 43 entries.
expand 43 if nomem_encr!=.
sort nomem_encr
// Fill year_month down the sorted data with the 43-month sequence; the sequence
// is written out twice, presumably so that fill() continues it as a repeating cycle.
egen year_month = fill(201003/201012 201101/201112 201201/201212 201301/201309 ///
	201003/201012 201101/201112 201201/201212 201301/201309)

// Safety net: if a respondent-month occurs exactly twice, keep one copy.
duplicates tag nomem_encr year_month, generate(dups)
bysort nomem_encr year_month: drop if dups==1 & _n>1
drop dups

// Write the respondent-by-month file. replace makes the do-file re-runnable:
// without it save stops with an error when 1survey_per_month_A.dta is left
// over from a previous run (the later save of this same file already uses replace).
save 1survey_per_month_A.dta, replace

//Add demographic values to the respondent file
// Every .dta file in `path' is merged onto the respondent-by-month data in
// memory, matching on respondent id and wave.
local path "immigrant_avars"
// The demographic files are keyed on wave; here wave is simply the month.
gen wave=year_month

// List the files with Stata's built-in directory macro function instead of
// the user-written fs command, and build every file name from `path' with
// forward slashes, so the loop does not need an extra package and is not tied
// to Windows path separators. Sorting makes the merge order the same on every
// machine.
local files : dir "`path'" files "*.dta"
local files : list sort files

foreach f of local files {
	di "`f'" 
	// update: fill values that are still missing in memory from the file,
	// without overwriting values that are already present.
	merge m:1 nomem_encr wave using "`path'/`f'" ,  update ///
		keepusing(geslacht leeftijd lftdcat burgstat nettoink nettocat oplcat) 
	tab _merge
	// Rows found only in the demographic file are not survey respondent-months.
	drop if _merge==2
	cap drop _merge
}

save 1survey_per_month_A.dta, replace

// Attach the demographics to the reduced survey data.
// Start from a clean session and reload the person-month file.
clear all
use imm_reduced_data.dta

// One demographic row per respondent-month. Keep survey rows whether or not
// demographics were found, discard rows that exist only in the demographic
// file, and do not leave a _merge variable behind.
merge 1:1 nomem_encr year_month using 1survey_per_month_A.dta, ///
	keep(master match) nogenerate

// ---- Gender (geslacht): recode the value 2 to 0, leave everything else as is
generate male = cond(geslacht==2, 0, geslacht)

// ---- Age (leeftijd = age, lftdcat = age category 1..7)
generate age_int = leeftijd
generate age_cat = lftdcat
// One 0/1 indicator per age category, in category order.
local k 0
foreach grp in 14 15_24 25_34 35_44 45_54 55_64 65 {
	local ++k
	generate age_`grp' = (age_cat == `k')
}

// ---- Civil status (burgstat): codes 2, 3 and 4 are pooled into code 2,
// code 5 becomes 3, any other value is carried over unchanged.
generate civil_status = cond(inlist(burgstat, 2, 3, 4), 2, ///
	cond(burgstat==5, 3, burgstat))
generate married = (civil_status == 1)
generate divorced = (civil_status == 2)
generate single = (civil_status == 3)

// ---- Net income (nettocat): categories 13 and 14 are treated as missing
generate net_income = cond(inlist(nettocat, 13, 14), ., nettocat)
// One 0/1 indicator per income category 0..12 (0 when income is missing).
forvalues i = 0/12 {
	generate net_income_`i' = (net_income == `i')
}
// Combined indicator for income categories above 7 and below 13.
generate net_income_com = (net_income > 7) & (net_income < 13)

// ---- Education (oplcat), 0/1 indicators
generate edu_HS = (oplcat == 3)
generate edu_less_HS = inlist(oplcat, 1, 2)
generate edu_college = inlist(oplcat, 5, 6)
generate edu_part_college = (oplcat == 4)

// The raw demographic variables and merge helpers are no longer needed.
drop nom wave geslacht leeftijd lftdcat burgstat nettoink nettocat oplcat

////////////////////////////////////////////////////////////////////////
// Merge comment categories

sort nomem_encr_A year_month

// Bring in the coded comment categories (Cat_A.dta). All survey rows are kept,
// rows that exist only in the category file are discarded, and no _merge
// variable is left behind.
merge 1:1 nomem_encr_A year_month using Cat_A.dta, keep(master match) nogenerate

// Final category per comment: rater 1 (the expert) takes precedence, rater 2's
// category is used only where rater 1 gave none.
gen comment_cat=cat_1
replace comment_cat=cat_2 if comment_cat==""

// 0/1 indicator for each final category, created in the same order as before.
foreach c in difficult error long neutral not_apply {
	generate cat_`c' = (comment_cat == "`c'")
}
// Both the label "other" and the label "negative" count as negative.
generate cat_negative = inlist(comment_cat, "other", "negative")
foreach c in positive trivial unclear {
	generate cat_`c' = (comment_cat == "`c'")
}

// Pooled indicator over the six categories listed here.
generate com_negative = inlist(comment_cat, "difficult", "error", "long", ///
	"not_apply", "trivial", "unclear")

// Per-rater 0/1 indicators: <Category>1 is built from cat_1, <Category>2 from
// cat_2. The indicator is 1 when that rater assigned the category and 0
// otherwise (including when the rater assigned nothing).
forvalues r = 1/2 {
	foreach c in Difficult Error Long Neutral Not_apply Negative Positive Trivial Unclear {
		// The label matched is the lower-case category name ...
		local code = lower("`c'")
		// ... except that Negative1 is matched against the label "other"
		// in cat_1, whereas Negative2 is matched against "negative" in cat_2.
		if "`c'"=="Negative" & `r'==1 {
			local code "other"
		}
		generate `c'`r' = (cat_`r' == "`code'")
	}
}

// ---- Comment length
// Rows without a token count get length 0.
replace ntoken = 0 if ntoken==.
generate log_words = log(ntoken+1)

// Length capped at 88 words, the 95th percentile of comment length.
generate cap_ntoken = min(ntoken, 88)

// A row that has a comment but no category from either rater cannot be used.
drop if comment_flag==1 & missing(comment_cat)

save imm_reduced_data.dta, replace


////////////////////////////////////////////////////////////////////////
// GEE final analysis

// Comments categorised as trivial are excluded from the models.
drop if cat_trivial==1

// Demographic controls shared by both GEE models. The indicators that are left
// out of this list (for example age_45_54, married, net_income_4 and edu_HS)
// serve as the baseline categories.
local demog male age_15_24 age_25_34 age_35_44 age_55_64 age_65 ///
	divorced single net_income_0 net_income_1 net_income_2 ///
	net_income_3 net_income_5 net_income_6 net_income_7 net_income_com ///
	edu_less_HS edu_part_college edu_college

// Logistic GEE with an exchangeable working correlation within respondent and
// robust standard errors.
local gee_opts corr(exch) link(logit) family(binomial 1) vce(robust)

// 1. Comment length without comment type
xtset nomem_encr_A
xtgee attrit comment_flag cap_ntoken `demog', `gee_opts'

// Show the estimated working correlation matrix.
matrix R=e(R)
matrix list R

// 2. Comment type with comment length, no interactions
xtset nomem_encr_A
xtgee attrit cap_ntoken cat_difficult cat_error cat_long cat_not_apply ///
	cat_negative cat_positive cat_unclear cat_neutral `demog', `gee_opts'

matrix R=e(R)
matrix list R

////////////////////////////////////////////////////////////////////////
// SIMEX final analysis

// Comments categorised as trivial are excluded (a no-op if they were already
// dropped for the GEE models above; kept so this section can be run on its own).
drop if cat_trivial==1

// 1. Comment type with comment length, no interactions
// Logistic SIMEX model of attrition. Each comment category enters as an
// error-prone covariate (w1..w8) measured twice: once by rater 1 (<Category>1)
// and once by rater 2 (<Category>2).
// Fixes relative to the earlier version:
//   - the capped comment length is called cap_ntoken (the name created in the
//     comment-length step and used by the GEE models); ntoken_cap does not exist
//   - w2 now pairs Error1 with Error2; previously Error2 was listed twice, so
//     rater 1's error coding was never used
// bstrap requests bootstrap standard errors; no seed is set in this file, so
// results differ slightly from run to run.
simex (attrit = cap_ntoken male age_15_24 age_25_34 age_35_44 age_55_64 age_65 ///
divorced single net_income_0 net_income_1 net_income_2 ///
net_income_3 net_income_5 net_income_6 net_income_7 net_income_com ///
edu_less_HS edu_part_college edu_college ) ///
(w1: Difficult1 Difficult2) (w2: Error1 Error2) (w3: Long1 Long2) ///
(w4: Not_apply1 Not_apply2) (w5: Unclear1 Unclear2) (w6: Negative1 Negative2) ///
(w7: Positive1 Positive2) (w8: Neutral1 Neutral2), ///
bstrap family(binomial) link(logit) robust
